In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import glob
In [2]:
import warnings
warnings.filterwarnings("ignore")
import sys
!{sys.executable} -m pip install pandas-profiling
Requirement already satisfied: pandas-profiling in /usr/local/lib/python3.7/site-packages (2.3.0)
Requirement already satisfied: confuse>=1.0.0 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (1.0.0)
Requirement already satisfied: astropy in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (3.2.3)
Requirement already satisfied: htmlmin>=0.1.12 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (0.1.12)
Requirement already satisfied: missingno>=0.4.2 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (0.4.2)
Requirement already satisfied: jinja2>=2.8 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (2.10.1)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (0.25.3)
Requirement already satisfied: phik>=0.9.8 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (0.9.8)
Requirement already satisfied: matplotlib>=1.4 in /usr/local/lib/python3.7/site-packages (from pandas-profiling) (3.1.1)
Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/site-packages (from confuse>=1.0.0->pandas-profiling) (5.1.2)
Requirement already satisfied: numpy>=1.13 in /usr/local/lib/python3.7/site-packages (from astropy->pandas-profiling) (1.17.4)
Requirement already satisfied: seaborn in /usr/local/lib/python3.7/site-packages (from missingno>=0.4.2->pandas-profiling) (0.9.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.7/site-packages (from missingno>=0.4.2->pandas-profiling) (1.3.2)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/site-packages (from jinja2>=2.8->pandas-profiling) (1.1.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/site-packages (from pandas>=0.19->pandas-profiling) (2019.3)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.7/site-packages (from pandas>=0.19->pandas-profiling) (2.8.0)
Requirement already satisfied: jupyter-client>=5.2.3 in /usr/local/lib/python3.7/site-packages (from phik>=0.9.8->pandas-profiling) (5.3.3)
Requirement already satisfied: pytest-pylint>=0.13.0 in /usr/local/lib/python3.7/site-packages (from phik>=0.9.8->pandas-profiling) (0.14.1)
Requirement already satisfied: nbconvert>=5.3.1 in /usr/local/lib/python3.7/site-packages (from phik>=0.9.8->pandas-profiling) (5.6.0)
Requirement already satisfied: pytest>=4.0.2 in /usr/local/lib/python3.7/site-packages (from phik>=0.9.8->pandas-profiling) (5.2.2)
Requirement already satisfied: numba>=0.38.1 in /usr/local/lib/python3.7/site-packages (from phik>=0.9.8->pandas-profiling) (0.46.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/site-packages (from matplotlib>=1.4->pandas-profiling) (2.4.5)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/site-packages (from matplotlib>=1.4->pandas-profiling) (1.1.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/site-packages (from matplotlib>=1.4->pandas-profiling) (0.10.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas>=0.19->pandas-profiling) (1.12.0)
Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (6.0.3)
Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (4.5.0)
Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (18.1.0)
Requirement already satisfied: traitlets in /usr/local/lib/python3.7/site-packages (from jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (4.3.2)
Requirement already satisfied: pylint>=1.4.5 in /usr/local/lib/python3.7/site-packages (from pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (2.4.4)
Requirement already satisfied: pygments in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (2.4.2)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (1.4.2)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.6.0)
Requirement already satisfied: nbformat>=4.4 in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (4.4.0)
Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.8.4)
Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.3)
Requirement already satisfied: bleach in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (3.1.0)
Requirement already satisfied: testpath in /usr/local/lib/python3.7/site-packages (from nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.4.2)
Requirement already satisfied: pluggy<1.0,>=0.12 in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (0.13.0)
Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (1.8.0)
Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (0.1.7)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (19.1.0)
Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (7.2.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (19.2)
Requirement already satisfied: importlib-metadata>=0.12; python_version < "3.8" in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (0.23)
Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.7/site-packages (from pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (1.3.0)
Requirement already satisfied: llvmlite>=0.30.0dev0 in /usr/local/lib/python3.7/site-packages (from numba>=0.38.1->phik>=0.9.8->pandas-profiling) (0.30.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib>=1.4->pandas-profiling) (41.6.0)
Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/site-packages (from traitlets->jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (0.2.0)
Requirement already satisfied: decorator in /usr/local/lib/python3.7/site-packages (from traitlets->jupyter-client>=5.2.3->phik>=0.9.8->pandas-profiling) (4.4.0)
Requirement already satisfied: isort<5,>=4.2.5 in /usr/local/lib/python3.7/site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (4.3.21)
Requirement already satisfied: astroid<2.4,>=2.3.0 in /usr/local/lib/python3.7/site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (2.3.3)
Requirement already satisfied: mccabe<0.7,>=0.6 in /usr/local/lib/python3.7/site-packages (from pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (0.6.1)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.7/site-packages (from nbformat>=4.4->nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (3.0.2)
Requirement already satisfied: webencodings in /usr/local/lib/python3.7/site-packages (from bleach->nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.5.1)
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/site-packages (from importlib-metadata>=0.12; python_version < "3.8"->pytest>=4.0.2->phik>=0.9.8->pandas-profiling) (0.6.0)
Requirement already satisfied: wrapt==1.11.* in /usr/local/lib/python3.7/site-packages (from astroid<2.4,>=2.3.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (1.11.2)
Requirement already satisfied: lazy-object-proxy==1.4.* in /usr/local/lib/python3.7/site-packages (from astroid<2.4,>=2.3.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (1.4.3)
Requirement already satisfied: typed-ast<1.5,>=1.4.0; implementation_name == "cpython" and python_version < "3.8" in /usr/local/lib/python3.7/site-packages (from astroid<2.4,>=2.3.0->pylint>=1.4.5->pytest-pylint>=0.13.0->phik>=0.9.8->pandas-profiling) (1.4.0)
Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.7/site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.4->nbconvert>=5.3.1->phik>=0.9.8->pandas-profiling) (0.15.4)
In [3]:
def preprocess(file):  
    data = [ x.replace('\n', ' ').split(' ')[:-1] for x in file.split('name') if '\x00' not in x  ]
    data = [x if len(x)==75 else x[1:] for x in data ] #all 75 attributes besides the name!
    data = [x  for x in data if len(x) ==75] # all rows shall have 75 columns!
    data = [[ a.replace('-9.', '-9') for a in x] for x in data]
    #print (data)
    if data[-1] == [''] : 
        data =  data[:-1]
    df = pd.DataFrame([[float (a) for a in x] for x in data])
    return df 
    

def load():
    df = pd.DataFrame()
    for file in glob.glob('*.data'): 
        if 'processed' not in file:
            print (file)
            f = open(file, encoding = 'ISO-8859-1').read()
            data = preprocess(f)
            #print (data)
            df = df.append(data)
    return df
    
In [4]:
df = load()
cleveland.data
switzerland.data
long-beach-va.data
In [5]:
df = df = df.loc[:, :67]

df.head()
Out[5]:
0 1 2 3 4 5 6 7 8 9 ... 58 59 60 61 62 63 64 65 66 67
0 1.0 0.0 63.0 1.0 -9.0 -9.0 -9.0 -9.0 1.0 145.0 ... 1.0 1.0 1.0 -9.0 1.0 -9.0 1.0 -9.0 1.0 1.0
1 2.0 0.0 67.0 1.0 -9.0 -9.0 -9.0 -9.0 4.0 160.0 ... 1.0 2.0 2.0 -9.0 2.0 -9.0 1.0 -9.0 1.0 1.0
2 3.0 0.0 67.0 1.0 -9.0 -9.0 -9.0 -9.0 4.0 120.0 ... 1.0 1.0 1.0 -9.0 1.0 -9.0 1.0 -9.0 2.0 2.0
3 4.0 0.0 37.0 1.0 -9.0 -9.0 -9.0 -9.0 3.0 130.0 ... 1.0 1.0 1.0 -9.0 1.0 -9.0 1.0 -9.0 1.0 1.0
4 6.0 0.0 41.0 0.0 -9.0 -9.0 -9.0 -9.0 2.0 130.0 ... 1.0 1.0 1.0 -9.0 1.0 -9.0 1.0 -9.0 1.0 1.0

5 rows × 68 columns

We want to predict 'Beta Blocker' property (Beta blocker used during exercise ECG: 1 = yes; 0 = no). The column index is 23

The attributes we take as our data for the model are:

- Age (in years) index 2
- Gender(sex) index 3
- Cholestrol level serum cholestoral in mg/dl index 11
- Number of years as a smoker  years index 14
In [6]:
X = pd.DataFrame({'age': df.iloc[:,2], 'Gender': df.iloc[:,3], 'Cholesterol': df.iloc[:,11],
                  'Num_years_smoke': df.iloc[:, 14], 'Beta_blocker': df.iloc[:,23]})
X.head()
Out[6]:
age Gender Cholesterol Num_years_smoke Beta_blocker
0 63.0 1.0 233.0 20.0 0.0
1 67.0 1.0 286.0 40.0 1.0
2 67.0 1.0 229.0 35.0 1.0
3 37.0 1.0 250.0 0.0 1.0
4 41.0 0.0 204.0 0.0 0.0
In [7]:
X.isna().any()
Out[7]:
age                False
Gender             False
Cholesterol        False
Num_years_smoke    False
Beta_blocker       False
dtype: bool
In [8]:
import pandas_profiling
X.profile_report()
Out[8]:

In [9]:
X = X[X>=0].dropna()
In [10]:
import pandas_profiling
X.profile_report()
Out[10]:

Compare GMM and Kmeans for the heart dataset via the silhouette index and corresponding plots

Wikipedia definition!

Silhouette refers to a method of interpretation and validation of consistency within clusters of data. The technique provides a succinct graphical representation of how well each object has been classified.[1]

The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.

The silhouette can be calculated with any distance metric, such as the Euclidean distance or the Manhattan distance.

In [11]:
X
Out[11]:
age Gender Cholesterol Num_years_smoke Beta_blocker
0 63.0 1.0 233.0 20.0 0.0
1 67.0 1.0 286.0 40.0 1.0
2 67.0 1.0 229.0 35.0 1.0
3 37.0 1.0 250.0 0.0 1.0
4 41.0 0.0 204.0 0.0 0.0
... ... ... ... ... ...
193 62.0 1.0 170.0 20.0 22.0
194 46.0 1.0 310.0 21.0 0.0
195 54.0 0.0 333.0 0.0 1.0
197 55.0 1.0 223.0 40.0 1.0
199 62.0 1.0 254.0 0.0 1.0

404 rows × 5 columns

In [12]:
X = X[X['Cholesterol']>0] # since we only want to take cholesterol values >0 in to account
In [13]:
from sklearn.decomposition import PCA
In [14]:
pca = PCA(n_components=2)
pca.fit(X)  
Out[14]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [15]:
X[['pc1', 'pc2']] = pd.DataFrame(pca.fit_transform(X))
In [16]:
X
Out[16]:
age Gender Cholesterol Num_years_smoke Beta_blocker pc1 pc2
0 63.0 1.0 233.0 20.0 0.0 -13.336945 3.357135
1 67.0 1.0 286.0 40.0 1.0 39.497921 24.142976
2 67.0 1.0 229.0 35.0 1.0 -17.423494 18.597356
3 37.0 1.0 250.0 0.0 1.0 3.292724 -18.579041
4 41.0 0.0 204.0 0.0 0.0 -42.595483 -18.705323
... ... ... ... ... ... ... ...
193 62.0 1.0 170.0 20.0 22.0 -2.804404 16.321204
194 46.0 1.0 310.0 21.0 0.0 26.436461 13.381104
195 54.0 0.0 333.0 0.0 1.0 7.180354 17.416487
197 55.0 1.0 223.0 40.0 1.0 -120.201952 -18.130962
199 62.0 1.0 254.0 0.0 1.0 -35.792662 1.465656

370 rows × 7 columns

In [17]:
import silhouette as si
In [18]:
si
Out[18]:
<module 'silhouette' from '/Users/shreyakapoor/Life_Science_Informatics/WiSe19/MLinLSI/silhouette.py'>
In [19]:
si.silhouette_method('k-means', X, 'pc1', 'pc2')
For n_clusters = 2 The average silhouette_score is : 0.4746306637480278
For n_clusters = 3 The average silhouette_score is : 0.4189044952210279
For n_clusters = 4 The average silhouette_score is : 0.4228157912679556
For n_clusters = 5 The average silhouette_score is : 0.36253292350100513
For n_clusters = 6 The average silhouette_score is : 0.347168769241494
In [20]:
si.silhouette_method('GMM', X, 'pc1', 'pc2')
For n_clusters = 2 The average silhouette_score is : 0.1530151199118994
For n_clusters = 3 The average silhouette_score is : 0.15729098429189328
For n_clusters = 4 The average silhouette_score is : 0.13150745642965433
For n_clusters = 5 The average silhouette_score is : 0.026397503998478122
For n_clusters = 6 The average silhouette_score is : 0.1873507524584749

The trend for the silhoutte index is similar for both KNN and GMM in this particular case of age and cholesterol!

In [ ]: